#delimit;
clear all;
set more off;

capture log using "01_Prepare_Guidestar_Basic.log", replace;
/*******************************************************************************************/;
** INPUT: Guidestar 2013-2018 DAF data [sent by Tim Reifschneider (originally accessed February 4, 2021)]; 
** OUTPUT: Clean data set with tax-based DAF info; 
** LAST MODIFIED: JAG, 7/9/2021;
** RUN TIME: 10 minutes using Stata 15 on computer with 2 x 2.60GHz processors and 64GB RAM;  													      	
/*******************************************************************************************/;

** NOTE: I PURPOSEFULLY READ IN AS ALL STRING SINCE SOME OBSERVATIONS SHOWED UP AS EXPONENTIAL OR WITHOUT LEAD ZERO;
** PERSONNEL DATA (I.E. EMPLOYEE DATA);
import excel "../Raw Data/Guidestar/DukeUniversity_202102.xlsx", sheet("Part VII-A") firstrow allstring clear;
rename *, lower;
count if ein=="";
gen year=real(fye);
replace title=lower(trim(title));
replace name=lower(trim(name));
foreach var in hrw comp_org comp_other {;
	gen double temp=real(`var');
	drop `var';
	rename temp `var';
};
rename comp_other comp_otr;
gen comp_tot=comp_org+comp_otr;

** WORKING ON NAME VARIABLES AS WANT TO IDENTIFY DAFs WHERE FAMILY MEMBERS OR OTHER BAD CORP GOV EXIST;
gen orig_name=name;
gen i1=regexm(name," - ");
replace name=substr(name,1,i1) if i1!=0;
drop i1;
replace name=lower(trim(name));
gen first3=substr(name,1,3);
gen first4=substr(name,1,4);
replace name=substr(name, 4,length(name)) if first3=="mr ";
replace name=substr(name, 4,length(name)) if first3=="ms ";
replace name=substr(name, 4,length(name)) if first3=="dr ";
replace name=substr(name, 5,length(name)) if first4=="mrs ";
replace name=substr(name, 5,length(name)) if first4=="rev ";
replace name=substr(name, 5,length(name)) if first4=="hon ";
replace name=regexr(name," jr","");
replace name=regexr(name," nmf","");
replace name=regexr(name," nmg","");
replace name=regexr(name," nmhc","");
replace name=regexr(name," nlfh","");
replace name=regexr(name," nmwr","");
replace name=regexr(name," chf","");
replace name=regexr(name," cpg","");
replace name=regexr(name," hfi","");
replace name=regexr(name," sr","");
replace name=regexr(name," iii","");
replace name=regexr(name," ii","");
replace name=regexr(name," phd","");
replace name=regexr(name," md","");
replace name=regexr(name," m\.d\.","");
replace name=regexr(name," ph\.d\.","");
replace name=regexr(name," cpa","");
replace name=regexr(name," mba","");
replace name=regexr(name," edd","");
replace name=regexr(name," esq","");
replace name=regexr(name," jd","");

** Trying to get rid of middle names this way;
replace name=regexr(name," a "," ");
replace name=regexr(name," b "," ");
replace name=regexr(name," c "," ");
replace name=regexr(name," d "," ");
replace name=regexr(name," e "," ");
replace name=regexr(name," f "," ");
replace name=regexr(name," g "," ");
replace name=regexr(name," h "," ");
replace name=regexr(name," i "," ");
replace name=regexr(name," j "," ");
replace name=regexr(name," k "," ");
replace name=regexr(name," l "," ");
replace name=regexr(name," m "," ");
replace name=regexr(name," n "," ");
replace name=regexr(name," o "," ");
replace name=regexr(name," p "," ");
replace name=regexr(name," q "," ");
replace name=regexr(name," r "," ");
replace name=regexr(name," s "," ");
replace name=regexr(name," t "," ");
replace name=regexr(name," u "," ");
replace name=regexr(name," v "," ");
replace name=regexr(name," w "," ");
replace name=regexr(name," x "," ");
replace name=regexr(name," y "," ");
replace name=regexr(name," z "," ");

replace name=regexr(name," a\. "," ");
replace name=regexr(name," b\. "," ");
replace name=regexr(name," c\. "," ");
replace name=regexr(name," d\. "," ");
replace name=regexr(name," e\. "," ");
replace name=regexr(name," f\. "," ");
replace name=regexr(name," g\. "," ");
replace name=regexr(name," h\. "," ");
replace name=regexr(name," i\. "," ");
replace name=regexr(name," j\. "," ");
replace name=regexr(name," k\. "," ");
replace name=regexr(name," l\. "," ");
replace name=regexr(name," m\. "," ");
replace name=regexr(name," n\. "," ");
replace name=regexr(name," o\. "," ");
replace name=regexr(name," p\. "," ");
replace name=regexr(name," q\. "," ");
replace name=regexr(name," r\. "," ");
replace name=regexr(name," s\. "," ");
replace name=regexr(name," t\. "," ");
replace name=regexr(name," u\. "," ");
replace name=regexr(name," v\. "," ");
replace name=regexr(name," w\. "," ");
replace name=regexr(name," x\. "," ");
replace name=regexr(name," y\. "," ");
replace name=regexr(name," z\. "," ");

replace name=regexr(name," a\."," ");
replace name=regexr(name," b\."," ");
replace name=regexr(name," c\."," ");
replace name=regexr(name," d\."," ");
replace name=regexr(name," e\."," ");
replace name=regexr(name," f\."," ");
replace name=regexr(name," g\."," ");
replace name=regexr(name," h\."," ");
replace name=regexr(name," i\."," ");
replace name=regexr(name," j\."," ");
replace name=regexr(name," k\."," ");
replace name=regexr(name," l\."," ");
replace name=regexr(name," m\."," ");
replace name=regexr(name," n\."," ");
replace name=regexr(name," o\."," ");
replace name=regexr(name," p\."," ");
replace name=regexr(name," q\."," ");
replace name=regexr(name," r\."," ");
replace name=regexr(name," s\."," ");
replace name=regexr(name," t\."," ");
replace name=regexr(name," u\."," ");
replace name=regexr(name," v\."," ");
replace name=regexr(name," w\."," ");
replace name=regexr(name," x\."," ");
replace name=regexr(name," y\."," ");
replace name=regexr(name," z\."," ");
replace name=lower(trim(name));

** Remove comma based end suffixes at end;
gen i1=index(name,",");
gen l=length(name);
gen dif=l-i1;
replace dif=. if i1==0;
replace name=substr(name,1,i1-1) if abs(dif<=3);
drop i1 l dif;
replace name=lower(trim(name));

** Remove some end suffixes;
gen l=length(name);
gen last3=substr(name,l-3,l);
gen last4=substr(name,l-4,l);
replace name=substr(name,1,l-4) if last3=="chf";
replace name=substr(name,1,l-4) if last3=="cpg";
replace name=substr(name,1,l-4) if last3=="hfi";
replace name=substr(name,1,l-4) if last3=="nmf";
replace name=substr(name,1,l-4) if last3=="dds";
replace name=substr(name,1,l-4) if last3=="csc";
replace name=substr(name,1,l-3) if last3==" sj";
replace name=substr(name,1,l-3) if last3==" iv";
replace name=substr(name,1,l-3) if last3==" rn";
replace name=substr(name,1,l-3) if last3==" od";
replace name=substr(name,1,l-3) if last3==" do";
replace name=substr(name,1,l-3) if last3==" ma";
replace name=substr(name,1,l-3) if last3==" ms";
replace name=substr(name,1,l-3) if last3==" cw";
replace name=substr(name,1,l-3) if last3==" zl";
replace name=substr(name,1,l-5) if last4=="nmhc";
replace name=substr(name,1,l-5) if last4=="nmpg";
replace name=substr(name,1,l-5) if last4=="nlfh";
replace name=substr(name,1,l-7) if last4=="h.d.";
drop l last3 last4;

** Finally get first and last name to identify families;
gen i1=index(name," ");
gen first_name=trim(substr(name,1,i1));
gen s1=index(first_name,",");
replace first_name=substr(first_name,1,s1-1) if s1!=0;
gen last_name=trim(substr(name,i1+1,length(name)));
drop i1;
gen i1=index(last_name," ");
gen alt_last_name=trim(substr(last_name,i1+1,length(last_name))) if i1!=.;
gen middle_name=trim(substr(last_name,1,i1));
br if orig_name=="poss, mary e.";
gen temp=first_name;
replace first_name=alt_last_name if s1!=0;
replace alt_last_name=temp if s1!=0;
drop s1 temp;

** some non-profits appear to have only put initials for last name so ignoring those;
replace alt_last_name="" if length(alt_last_name)<2;
** these are people where first and last are switched;
gen switch=1 if middle_name!="" & alt_last_name=="";
gen temp=first_name if switch==1;
replace first_name=middle_name if switch==1;
replace alt_last_name=temp if switch==1;
drop last_name;
rename alt_last_name last_name;
bys ein fye last_name: gen dup=_N;
gen family=(dup>1 & dup<8);
replace family=0 if regexm(orig_name,"volunteer");
replace family=0 if regexm(orig_name,"vacant");
replace family=0 if regexm(orig_name,"pnc");
drop dup;
bys ein fye: egen tot_family=total(family);
gsort -tot_family;
save "Employee Detailed.dta", replace;
gen tot_emp=1;
gen comp_tot_tot=comp_tot;
gen comp_tot_fam=comp_tot if family==1;
collapse (max) family tot_family (sum) tot_emp comp_tot_tot comp_tot_fam (mean) comp_org comp_tot comp_otr, by(ein year);
rename comp_org avg_emp_comp_base; 
rename comp_tot avg_emp_comp_tot;
rename comp_otr avg_emp_comp_otr;
gen pct_fam_comp=comp_tot_fam/comp_tot_tot;
replace pct_fam_comp=0 if family==0;
save "Employee Collapsed.dta", replace;


** CONTRACTOR DATA;
import excel "../Raw Data/Guidestar/DukeUniversity_202102.xlsx", sheet("Part VII-B") firstrow allstring clear;
rename *, lower;
count if ein=="";
gen year=real(fye);
replace service=lower(trim(service));
tab service, m;
gen investor=regexm(service,"invest");
tab investor, m;
gen consulting=regexm(service,"consult");
gen data=regexm(service,"data");
replace data=regexm(service,"statist");
gen fundraising=regexm(service,"fundrais");
gen double contr_comp=real(comp);
drop comp;
sum contr_comp, d;
sum contr_comp if investor==1;
save "Contractors Detailed.dta", replace;
gen contr_count=1;
gen avg_contr_comp=contr_comp;
collapse (sum) contr_count consulting data fundraising investor contr_comp (mean) avg_contr_comp, by(ein year);
save "Contractors Collapsed.dta", replace;

** SECURITIES DATA (will need to be filled in);
import excel "../Raw Data/Guidestar/DukeUniversity_202102.xlsx", sheet("Sched D, Part VII") firstrow allstring clear;
rename *, lower;
count if ein=="";
gen year=real(fye);
bys record_id fye: gen tot_sec=_N;
save "Securities Detail.dta", replace;
drop tot_sec;
gen tot_sec=1;
collapse (sum) tot_sec, by (record_id year);
save "Securities Collapsed.dta", replace;

** INVESTMENTS DATA (will need to be filled in);
import excel "../Raw Data/Guidestar/DukeUniversity_202102.xlsx", sheet("Sched D, Part VIII") firstrow allstring clear;
rename *, lower;
gen year=real(fye);
bys record_id fye: gen tot_inv=_N;
save "Investments Detail.dta", replace;
drop tot_inv;
gen tot_inv=1;
collapse (sum) tot_inv, by (record_id year);
save "Investments Collapsed.dta", replace;


** DOMESTIC GRANTS DATA;
import delimited "../Raw Data/Guidestar/dom_grant_1.csv", asdouble stringcols(3 10) clear;
save "t1.dta", replace;
import delimited "../Raw Data/Guidestar/dom_grant_2.csv", asdouble stringcols(3 10) clear;
append using "t1.dta";
save "t12.dta", replace;
use "t12.dta", clear;
count if ein=="";
rename fye year;
rename org_cash dom_grant_amt;
replace dom_grant_amt=org_non_cash if dom_grant_amt==. & org_non_cash!=.;
** We have purpose so we have the potential to do a lot with purpose and or region;
** For now though, just going to get basic statistics per year;
** We also have grantee_ein;
replace grant_purpose=lower(trim(grant_purpose));
gen dom_grant_edu=regexm(grant_purpose,"educ");
gen dom_grant_health=regexm(grant_purpose,"health");
replace dom_grant_health=1 if regexm(grant_purpose,"well");
gen dom_grant_lgbtq=1 if regexm(grant_purpose,"lgbtq");
save "Dom Grants Detail.dta", replace;
gen sd_dom_grant_amt=dom_grant_amt;
gen max_dom_grant_amt=dom_grant_amt;
gen min_dom_grant_amt=dom_grant_amt;
gen num_dom_grants=1;
gen num_dom_grants_w_amt=1 if dom_grant_amt!=.;
gen med_dom_grant_amt=dom_grant_amt;
gen sum_dom_grant_amt=dom_grant_amt;
collapse (sum) num_dom_grants_w_amt sum_dom_grant_amt num_dom_grants (mean) dom_grant_amt (median) med_dom_grant_amt (sd) sd_dom_grant_amt (max) max_dom_grant_amt (min) min_dom_grant_amt, by(record_id year);
rename dom_grant_amt mean_dom_grant_amt;
save "Dom Grants Collapsed.dta", replace;


** INTL GRANT DATA;
import excel "../Raw Data/Guidestar/DukeUniversity_202102.xlsx", sheet("Sched F, Part II") firstrow allstring clear;
rename *, lower;
count if ein=="";
gen year=real(fye);
gen double intl_grant_amt=real(cash_grant);
gen double temp=real(disburse_manner);
replace intl_grant_amt=temp if intl_grant_amt==. & temp!=.;
drop temp;
** We have purpose so we have the potential to do a lot with purpose and or region;
** For now though, just going to get basic statistics per year;
** We also have grantee_ein;
replace purpose=lower(trim(purpose));
gen intl_grant_edu=regexm(purpose,"educ");
gen intl_grant_health=regexm(purpose,"health");
replace intl_grant_health=1 if regexm(purpose,"well");
save "Intl Grants Detail.dta", replace;
gen sd_intl_grant_amt=intl_grant_amt;
gen max_intl_grant_amt=intl_grant_amt;
gen min_intl_grant_amt=intl_grant_amt;
gen num_intl_grants=1;
gen num_intl_grants_w_amt=1 if intl_grant_amt!=.;
gen med_intl_grant_amt=intl_grant_amt;
collapse (sum) num_intl_grants_w_amt num_intl_grants (mean) intl_grant_amt (median) med_intl_grant_amt (sd) sd_intl_grant_amt (max) max_intl_grant_amt (min) min_intl_grant_amt, by(record_id year);
rename intl_grant_amt mean_intl_grant_amt;
save "Intl Grants Collapsed.dta", replace;

** OFFICER DATA;
import excel "../Raw Data/Guidestar/DukeUniversity_202102.xlsx", sheet("Sched J, Part II") firstrow allstring clear;
rename *, lower;
count if ein=="";
gen year=real(fye);
foreach var in base_comp total_benefits {;
	gen double temp=real(`var');
	format temp %12.0f;
	drop `var';
	rename temp `var';
};
gen fringe_officer_benefit=(total_benefits/base_comp)-1;
gen tot_comp=base_comp;
gen tot_officers=1;
save "Officers Detail.dta", replace;
collapse (mean) base_comp fringe_officer_benefit (sum) tot_comp tot_officers, by(record_id year);
save "Officers Collapsed.dta", replace;

** ADDITIONAL ECOYSTEM (i.e., RELATED ORGs) DATA;
import excel "../Raw Data/Guidestar/DukeUniversity_202102.xlsx", sheet("Sched R, Part II") firstrow allstring clear;
rename *, lower;
count if ein=="";
gen year=real(fye);
save "Related Org Detail.dta", replace;
gen tot_rel_org=1;
collapse (sum) tot_rel_org, by(ein year);
save "Related Org Collapsed.dta", replace;


** MAIN DATA WITH FINANCIALS;
import excel "../Raw Data/Guidestar/DukeUniversity_202102.xlsx", sheet("Main") firstrow allstring clear;
rename *, lower;
gen year=real(fye);
** THIS IS IMPACTASSETS WHICH I LOOK AT TO UNDERSTAND FILES;
*keep if ein=="26-2048480";
*keep if year==2018;

replace zip=trim(zip);
gen temp=substr(zip,1,5);
rename zip zip_long;
rename temp zipcode;
gen double found_year=real(formation_year);
tab found_year, m;
** Converting to real numbers, need to go back and check that dollars or other things aren't throwing off;
foreach var in all_year_employees total_officers voting_members total_comp_org total_comp_other total_contractors 
membership_dues contrib_other contrib_noncash contrib_all service_revenue income_investment income_bonds 
gross_securities gross_other cost_securities cost_other gain_securities gain_other net_sales revenue_total 
grants_orgs_us grants_individuals_us grants_orgs_non_us benefits_total compensation_current compensation_disqualified 
wages_total pension_total employee_benefits_total payroll_tax_total fees_management fees_legal accounting_fees_total 
fees_lobbying fundraising_total fees_investments fees_other ads_promo info_tech meetings_total depreciation_total 
expense_a_total expense_b_total expense_c_total expense_d_total expense_e_total expenses_total 
cash_boy cash_eoy savings_boy savings_eoy pledges_grants_boy pledges_grants_eoy public_securities_boy 
public_securities_eoy other_securities_boy other_securities_eoy assets_total_boy assets_total_eoy 
grants_payable_boy grants_payable_eoy liability_total_boy liability_total_eoy unrestricted_boy unrestricted_eoy 
perm_restricted_boy perm_restricted_eoy net_total_boy net_total_eoy balances_boy balances_eoy net_asset_change 
number_daf number_accounts contrib_donor_funds schd_contrib_other grants_daf grants_other total_daf_value 
total_accounts_value tot_c3 tot_other contrib_all_prior service_revenue_prior income_investment_prior 
revenue_other_prior tot_rev_prior tot_rev_current grants_prior benefits_total_prior comp_benefits_prior 
fundraising_total_prior other_expenses_prior expenses_total_prior rev_exp_prior receipt_tot prog_serv_exp_total 
tot_ubr net_ubr_990t {;
	gen double temp=real(`var');
	format temp %12.0f;
	drop `var';
	rename temp `var';
};
replace orgname=lower(trim(orgname));
gen school=regexm(orgname,"university");
replace school=1 if regexm(orgname,"college") & school==0;
tab school, m;
gen medical=regexm(orgname,"hospital");
replace medical=1 if regexm(orgname,"medical") & medical==0;
tab medical, m;
preserve;
keep if school==1|medical==1;
save "Eds and Meds DAFs.dta", replace; 
restore;
*drop if school==1|medical==1;
drop school medical;
tab year, m;
save "Main.dta", replace;

** THIS PART MATCHES THE SECURITY TAB TO THE PDF LINKS THAT WE WILL NEED TO DOWNLOAD TO GET VALUES;
use "Securities Collapsed.dta", clear;
merge 1:1 record_id year using "Main.dta";
keep if _m==3;
*br record_id year tot_sec ein profile_link form_link orgname;
** NOTE: the browse above is what I sent to the RAs to download PDFs and digitize actual investments;

/*
use "Securities Detail.dta", clear;
merge m:1 record_id year using "Main.dta";
keep if _m==3;
keep record_id year ein profile_link form_link orgname fye record_id description_security method_eoy_market_value year;
order record_id year ein profile_link form_link orgname fye record_id description_security method_eoy_market_value year;
gsort ein -year;
** This is the excel file that will need to be filled in for handcode;
export excel record_id year ein form_link profile_link orgname fye description_security method_eoy_market_value using "For Handcode/Securites Handcode.xls", firstrow(variables) nolabel replace;

use "Investments Collapsed.dta", clear;
merge 1:1 record_id year using "Main.dta";
keep if _m==3;
*br record_id year tot_inv ein profile_link form_link orgname;
*/

** THIS GENERATES BASIC FINANCIAL INFORMATION;
use "Main.dta", clear;
order ein orgname year zipcode;
gen contr_gr=(contrib_all-contrib_all_prior)/contrib_all_prior;
gen prog_rev_gr=(service_revenue-service_revenue_prior)/service_revenue_prior;
gen inv_rev_gr=(income_investment-income_investment_prior)/income_investment_prior;
gen tot_rev_gr=(tot_rev_current-tot_rev_prior)/tot_rev_prior;
gen at_gr=(assets_total_eoy-assets_total_boy)/(assets_total_boy);
label var tot_rev_current "Sum of all income sources received in year";
gen exp_gr=(expenses_total-expenses_total_prior)/expenses_total_prior;
label var tot_ubr "Total unrelated business revenue Part I, 7a";
label var contr_gr "Contributions and grants growth";

** What are investment management fees?;
** What is total fee structure?;
gen tot_fees=fees_management+fees_legal+accounting_fees_total+fees_lobbying+fundraising_total+fees_investments+fees_other;
** NOTE: Expenses include program-related expenses (i.e., grants) so this comparison isn't great;
gen fees2exp=tot_fees/expenses_total;

** Unrestricted surplus or deficit;
gen surplus=unrestricted_eoy-unrestricted_boy;
gen surplus_pre_depr=unrestricted_eoy-unrestricted_boy+depreciation_total;

** These are investments in public and other securities;
rename public_securities_eoy pub_sec;
gen pub_sec_gr=(pub_sec-public_securities_boy)/(public_securities_boy);
rename other_securities_eoy otr_sec;
gen otr_sec_gr=(otr_sec-other_securities_boy)/(other_securities_boy);
gen inv_sec=pub_sec+otr_sec;
gen inv_sec_boy=public_securities_boy+other_securities_boy;
gen inv_sec_gr=(inv_sec-inv_sec_boy)/(inv_sec_boy);
gen inv2at=inv_sec/assets_total_eoy;
gen invfees2inv=fees_investments/inv_sec;
gen invfees2at=fees_investments/assets_total_eoy;

** This is cash vs other assets contributed;
gen cash_contrib=contrib_all-(contrib_noncash+contrib_other);
gen cash2allcontrib=cash_contrib/contrib_all;
** How much is commission and loan loss?;
gen loan_loss=0;
gen commission=0;
foreach var in expense_a expense_b expense_c expense_d {;
	replace `var'_desc=lower(trim(`var'_desc));
	replace loan_loss=`var'_total if regexm(`var'_desc,"loan loss") & loan_loss==.;
	replace commission=`var'_total if regexm(`var'_desc,"commission") & commission==.;
	};
	
** WANT TO ADD INFORMATION ABOUT GRANTS AND VARIANCE OF GRANTS, INTL vs. DOMESTIC MIX, WITHIN CERTAIN RADIUS OF HQ;
merge 1:1 record_id year using "Intl Grants Collapsed.dta";
gen no_intl_grants=(_m==1);
tab no_intl_grants, m;
drop _m;
merge 1:1 record_id year using "Dom Grants Collapsed.dta";
gen no_dom_grants=(_m==1);
tab no_dom_grants, m;
drop _m;
merge 1:1 record_id year using "Officers Collapsed.dta";
gen no_officers=(_m==1);
tab no_officers, m;
drop _m;
merge 1:1 ein year using "Contractors Collapsed.dta";
gen no_contractors=(_m==1);
tab no_contractors, m;
drop _m;
merge 1:1 ein year using "Employee Collapsed.dta";
gen no_emp=(_m==1);
tab no_emp, m;
drop _m;
merge 1:1 ein year using "Related Org Collapsed.dta";
gen no_rel_org=(_m==1);
tab no_rel_org, m;
drop _m;
merge 1:1 record_id year using "Securities Collapsed.dta";
gen no_sec_det=(_m==1);
tab no_sec_det, m;
drop _m;
merge 1:1 record_id year using "Investments Collapsed.dta";
gen no_inv_det=(_m==1);
tab no_inv_det, m;
drop _m;
** MISC cleaning up of main control variables;
gen daf_size=log(1+assets_total_boy);
gen temp=real(formation_year);
rename formation_year str_formation_year;
rename temp formation_year;
gen daf_age=abs(year-formation_year);
sum daf_age, d;
br str_formation_year if formation_year==.;
egen mean_daf_age=mean(daf_age);
gen daf_age_filled=daf_age;
replace daf_age_filled=mean_daf_age if daf_age==.;
save "Main with additional details.dta", replace;

log close;
